getwd()
## [1] "/Users/meabhhartney/Desktop/DATA6530"
Sys.time()
## [1] "2023-03-06 22:30:35 EST"
alcoholsales <- readxl::read_excel("data/AlcoholSales.xlsx") 
alcoholsales
alc <- alcoholsales %>% 
  mutate(Quarter = yearquarter(Date)) %>% 
  as_tsibble(index = Quarter) 
alc
alc %>% 
  filter(year(Quarter) <= 2016) -> alc_train 
alc_train
alc %>% 
  filter(year(Quarter) > 2016) -> alc_test 
alc_test
alc %>% 
  autoplot(Sales)

alc %>% 
  autoplot(GDP)

alc %>% 
  autoplot(CPI)

alc %>% 
  autoplot(Sales) + labs(y = "Sales") + 
geom_ribbon(aes(xmin = 18200, xmax = 18900), fill = "pink", alpha = 0.4) +
   annotate("text", x = 18500, y = 2000, label = "Covid-19 Pandemic", col = "red", size = 3)

#geom_ribbon(aes(xmin = 1979.98, xmax = 1989.13), fill = "pink", alpha = 0.4) +
  #annotate("text", x = 1984.5, y = 10, label = "Soviet-Afghan War", col = "red", size = 3)
alc %>% 
  autoplot(GDP) + labs(y = "GDP") + 
geom_ribbon(aes(xmin = 18200, xmax = 18900), fill = "pink", alpha = 0.4) +
   annotate("text", x = 18500, y = 2000, label = "Covid-19 Pandemic", col = "red", size = 3)

alc %>% 
features(Sales, guerrero)

As lamba is very close to 0, a log transformation of the data would be best

alc %>%
  autoplot(log(Sales))

alc_fit <- alc_train %>%
  model(
    Mean = MEAN(log(Sales)),
    `Naïve` = NAIVE(log(Sales)),
    `Seasonal naïve` = SNAIVE(log(Sales)),
    Drift = RW(log(Sales) ~ drift())
    )
alc_fc <- alc_fit %>% 
  forecast(alc_test)
alc_fc %>% 
  autoplot(alc, 
      level = NULL
  ) +
  labs(
    y = "$ (millions)",
    title = "Forecasts for quarterly sales"
  ) +
  guides(colour = guide_legend(title = "Forecast"))      

accuracy(alc_fit)
accuracy(alc_fc, alc)
  1. Time Series Regression Models using the train data (1992-2016)
reg_model0 <- alc_train %>% 
  model(TSLM(log(Sales) ~ trend() + season() + GDP + CPI))
report(reg_model0)
## Series: Sales 
## Model: TSLM 
## Transformation: log(Sales) 
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0544804 -0.0118517 -0.0007681  0.0138791  0.0580104 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.317e+00  1.210e-01  52.224  < 2e-16 ***
## trend()       -1.590e-03  1.284e-03  -1.238    0.219    
## season()year2  1.032e-01  6.210e-03  16.623  < 2e-16 ***
## season()year3  1.279e-01  6.238e-03  20.500  < 2e-16 ***
## season()year4  2.318e-01  6.824e-03  33.969  < 2e-16 ***
## GDP            1.757e-07  2.823e-08   6.224 1.38e-08 ***
## CPI            5.192e-03  8.087e-04   6.420 5.67e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02075 on 93 degrees of freedom
## Multiple R-squared: 0.9955,  Adjusted R-squared: 0.9952
## F-statistic:  3409 on 6 and 93 DF, p-value: < 2.22e-16
reg_model1 <- alc_train %>% 
  model(TSLM(log(Sales) ~ season() + GDP+ CPI))
report(reg_model1)
## Series: Sales 
## Model: TSLM 
## Transformation: log(Sales) 
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0557126 -0.0122025 -0.0005554  0.0143956  0.0550294 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.451e+00  5.428e-02 118.856  < 2e-16 ***
## season()year2 1.050e-01  6.065e-03  17.308  < 2e-16 ***
## season()year3 1.294e-01  6.138e-03  21.076  < 2e-16 ***
## season()year4 2.344e-01  6.521e-03  35.938  < 2e-16 ***
## GDP           1.502e-07  1.935e-08   7.763 1.00e-11 ***
## CPI           4.485e-03  5.743e-04   7.810 7.98e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02081 on 94 degrees of freedom
## Multiple R-squared: 0.9954,  Adjusted R-squared: 0.9952
## F-statistic:  4067 on 5 and 94 DF, p-value: < 2.22e-16
fc0 <- forecast(reg_model0,alc_test) 
fc0 %>% 
  autoplot(alc)

accuracy(reg_model0)
accuracy(fc0, alc)
fc1 <- forecast(reg_model1,alc_test) 
fc1 %>% 
  autoplot(alc)

accuracy(reg_model1)
accuracy(fc1, alc)
alc_train %>% 
  autoplot(Sales, col = "blue") +
  geom_line(data = augment(reg_model0), aes(y = .fitted), col = "red") + labs(title = "Regression Model using trend, season, GDP and CPI (reg_model0)")

alc_train %>% 
  autoplot(Sales, col = "blue") +
  geom_line(data = augment(reg_model1), aes(y = .fitted), col = "red") + labs(title = "Regression Model using  season, GDP and CPI (reg_model1)")

alc %>% 
  autoplot(log(Sales))

alc %>% 
  features(Sales,unitroot_kpss)

Small P value, differencing is required

alc %>% 
  features(Sales, unitroot_ndiffs)
alc %>% 
features(Sales, unitroot_nsdiffs)
alc_train %>% 
  autoplot(log(Sales) %>% difference())

alc %>% 
  gg_tsdisplay(difference(log(Sales),4), plot_type='partial', lag = 36) +
  labs(title = "Quarterly sales with differencing", y="")

ARIMA models

fit1 <- alc_train %>%
  model(
    arima011210 = ARIMA(Sales ~ pdq(0,1,1) + PDQ(2,1,0)),
    arima012110 = ARIMA(Sales ~ pdq(0,1,2) + PDQ(1,1,0)),
auto = ARIMA(Sales, stepwise = FALSE, approx = FALSE)
)
fit1 %>% pivot_longer(everything(), names_to = "Model name",
                     values_to = "Orders")
glance(fit1) %>% arrange(AICc) %>% select(.model:BIC)
forecast(fit1, alc_test) %>%
  autoplot(alc) +
  labs(title = "Forecasting quarterly sales with ARIMA models",
       y="$ million")

forecast(fit1, alc_test) %>%
  filter(.model=='auto') %>%
  autoplot(alc) +
  labs(title = "Forecasting quarterly sales with auto ARIMA model",
       y="$ million")

ETS model

fit_ets <-alc %>% 
model(
    SES = ETS(log(Sales) ~ error("A") + trend("N") + season("N")),
    Holt = ETS(log(Sales) ~ error("A") + trend("A") + season("N")),
    Damped = ETS(log(Sales) ~ error("A") + trend("Ad") + season("N")),
    auto = ETS(log(Sales))
)

accuracy(fit_ets) 
forecast(fit_ets, alc_test) %>%
  autoplot(alc)

forecast(fit_ets, alc_test) %>%
  filter(.model=='auto') %>%
  autoplot(alc)

fit_ets %>% 
  forecast(h = 10) %>% 
  autoplot(alc)

fit_ets %>% 
  forecast(h = 10) %>% 
  filter(.model=='auto') %>%
  autoplot(alc)

fitall <- alc_train %>%
  model(
    "Holt" = ETS(log(Sales) ~ error("A") + trend("A") + season("N")),
    "ARIMA" = ARIMA(log(Sales)),
    "naïve" = NAIVE(log(Sales)),
    "Regression" = TSLM(log(Sales) ~ season() + GDP + CPI)  
  )
fcall <- forecast(fitall, alc_test)
accuracy(fitall)
accuracy(fcall, alc)
alc %>%
  autoplot(Sales) +
  autolayer(fcall) +
  labs(x = "Quarter",
    title = "Forecasting Model Comparison",
    y = "$ million"
  )

Regression model is best

fitall %>% select("Regression") %>% report()
## Series: Sales 
## Model: TSLM 
## Transformation: log(Sales) 
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0557126 -0.0122025 -0.0005554  0.0143956  0.0550294 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.451e+00  5.428e-02 118.856  < 2e-16 ***
## season()year2 1.050e-01  6.065e-03  17.308  < 2e-16 ***
## season()year3 1.294e-01  6.138e-03  21.076  < 2e-16 ***
## season()year4 2.344e-01  6.521e-03  35.938  < 2e-16 ***
## GDP           1.502e-07  1.935e-08   7.763 1.00e-11 ***
## CPI           4.485e-03  5.743e-04   7.810 7.98e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02081 on 94 degrees of freedom
## Multiple R-squared: 0.9954,  Adjusted R-squared: 0.9952
## F-statistic:  4067 on 5 and 94 DF, p-value: < 2.22e-16